1 package org.apache.lucene.index;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Random;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
import org.junit.BeforeClass;
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68 public class TestCodecs extends LuceneTestCase {
69 private static String[] fieldNames = new String[] {"one", "two", "three", "four"};
70
71 private static int NUM_TEST_ITER;
72 private final static int NUM_TEST_THREADS = 3;
73 private final static int NUM_FIELDS = 4;
74 private final static int NUM_TERMS_RAND = 50;
75 private final static int DOC_FREQ_RAND = 500;
76 private final static int TERM_DOC_FREQ_RAND = 20;
77
/** Chooses the per-verifier iteration count once for the whole class, using {@code atLeast(20)}. */
@BeforeClass
public static void beforeClass() {
  NUM_TEST_ITER = atLeast(20);
}
82
83 class FieldData implements Comparable<FieldData> {
84 final FieldInfo fieldInfo;
85 final TermData[] terms;
86 final boolean omitTF;
87 final boolean storePayloads;
88
89 public FieldData(final String name, final FieldInfos.Builder fieldInfos, final TermData[] terms, final boolean omitTF, final boolean storePayloads) {
90 this.omitTF = omitTF;
91 this.storePayloads = storePayloads;
92
93 fieldInfo = fieldInfos.getOrAdd(name);
94 if (omitTF) {
95 fieldInfo.setIndexOptions(IndexOptions.DOCS);
96 } else {
97 fieldInfo.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
98 }
99 if (storePayloads) {
100 fieldInfo.setStorePayloads();
101 }
102 this.terms = terms;
103 for(int i=0;i<terms.length;i++)
104 terms[i].field = this;
105
106 Arrays.sort(terms);
107 }
108
109 @Override
110 public int compareTo(final FieldData other) {
111 return fieldInfo.name.compareTo(other.fieldInfo.name);
112 }
113 }
114
115 class PositionData {
116 int pos;
117 BytesRef payload;
118
119 PositionData(final int pos, final BytesRef payload) {
120 this.pos = pos;
121 this.payload = payload;
122 }
123 }
124
125 class TermData implements Comparable<TermData> {
126 String text2;
127 final BytesRef text;
128 int[] docs;
129 PositionData[][] positions;
130 FieldData field;
131
132 public TermData(final String text, final int[] docs, final PositionData[][] positions) {
133 this.text = new BytesRef(text);
134 this.text2 = text;
135 this.docs = docs;
136 this.positions = positions;
137 }
138
139 @Override
140 public int compareTo(final TermData o) {
141 return text.compareTo(o.text);
142 }
143 }
144
145 final private static String SEGMENT = "0";
146
147 TermData[] makeRandomTerms(final boolean omitTF, final boolean storePayloads) {
148 final int numTerms = 1+random().nextInt(NUM_TERMS_RAND);
149
150 final TermData[] terms = new TermData[numTerms];
151
152 final HashSet<String> termsSeen = new HashSet<>();
153
154 for(int i=0;i<numTerms;i++) {
155
156
157 String text2;
158 while(true) {
159 text2 = TestUtil.randomUnicodeString(random());
160 if (!termsSeen.contains(text2) && !text2.endsWith(".")) {
161 termsSeen.add(text2);
162 break;
163 }
164 }
165
166 final int docFreq = 1+random().nextInt(DOC_FREQ_RAND);
167 final int[] docs = new int[docFreq];
168 PositionData[][] positions;
169
170 if (!omitTF)
171 positions = new PositionData[docFreq][];
172 else
173 positions = null;
174
175 int docID = 0;
176 for(int j=0;j<docFreq;j++) {
177 docID += TestUtil.nextInt(random(), 1, 10);
178 docs[j] = docID;
179
180 if (!omitTF) {
181 final int termFreq = 1+random().nextInt(TERM_DOC_FREQ_RAND);
182 positions[j] = new PositionData[termFreq];
183 int position = 0;
184 for(int k=0;k<termFreq;k++) {
185 position += TestUtil.nextInt(random(), 1, 10);
186
187 final BytesRef payload;
188 if (storePayloads && random().nextInt(4) == 0) {
189 final byte[] bytes = new byte[1+random().nextInt(5)];
190 for(int l=0;l<bytes.length;l++) {
191 bytes[l] = (byte) random().nextInt(255);
192 }
193 payload = new BytesRef(bytes);
194 } else {
195 payload = null;
196 }
197
198 positions[j][k] = new PositionData(position, payload);
199 }
200 }
201 }
202
203 terms[i] = new TermData(text2, docs, positions);
204 }
205
206 return terms;
207 }
208
209 public void testFixedPostings() throws Throwable {
210 final int NUM_TERMS = 100;
211 final TermData[] terms = new TermData[NUM_TERMS];
212 for(int i=0;i<NUM_TERMS;i++) {
213 final int[] docs = new int[] {i};
214 final String text = Integer.toString(i, Character.MAX_RADIX);
215 terms[i] = new TermData(text, docs, null);
216 }
217
218 final FieldInfos.Builder builder = new FieldInfos.Builder();
219
220 final FieldData field = new FieldData("field", builder, terms, true, false);
221 final FieldData[] fields = new FieldData[] {field};
222 final FieldInfos fieldInfos = builder.finish();
223 final Directory dir = newDirectory();
224 Codec codec = Codec.getDefault();
225 final SegmentInfo si = new SegmentInfo(dir, Version.LATEST, SEGMENT, 10000, false, codec, Collections.<String,String>emptyMap(), StringHelper.randomId(), new HashMap<String,String>());
226
227 this.write(si, fieldInfos, dir, fields);
228 final FieldsProducer reader = codec.postingsFormat().fieldsProducer(new SegmentReadState(dir, si, fieldInfos, newIOContext(random())));
229
230 final Iterator<String> fieldsEnum = reader.iterator();
231 String fieldName = fieldsEnum.next();
232 assertNotNull(fieldName);
233 final Terms terms2 = reader.terms(fieldName);
234 assertNotNull(terms2);
235
236 final TermsEnum termsEnum = terms2.iterator();
237
238 PostingsEnum postingsEnum = null;
239 for(int i=0;i<NUM_TERMS;i++) {
240 final BytesRef term = termsEnum.next();
241 assertNotNull(term);
242 assertEquals(terms[i].text2, term.utf8ToString());
243
244
245
246
247 for(int iter=0;iter<2;iter++) {
248 postingsEnum = TestUtil.docs(random(), termsEnum, postingsEnum, PostingsEnum.NONE);
249 assertEquals(terms[i].docs[0], postingsEnum.nextDoc());
250 assertEquals(DocIdSetIterator.NO_MORE_DOCS, postingsEnum.nextDoc());
251 }
252 }
253 assertNull(termsEnum.next());
254
255 for(int i=0;i<NUM_TERMS;i++) {
256 assertEquals(termsEnum.seekCeil(new BytesRef(terms[i].text2)), TermsEnum.SeekStatus.FOUND);
257 }
258
259 assertFalse(fieldsEnum.hasNext());
260 reader.close();
261 dir.close();
262 }
263
264 public void testRandomPostings() throws Throwable {
265 final FieldInfos.Builder builder = new FieldInfos.Builder();
266
267 final FieldData[] fields = new FieldData[NUM_FIELDS];
268 for(int i=0;i<NUM_FIELDS;i++) {
269 final boolean omitTF = 0==(i%3);
270 final boolean storePayloads = 1==(i%3);
271 fields[i] = new FieldData(fieldNames[i], builder, this.makeRandomTerms(omitTF, storePayloads), omitTF, storePayloads);
272 }
273
274 final Directory dir = newDirectory();
275 final FieldInfos fieldInfos = builder.finish();
276
277 if (VERBOSE) {
278 System.out.println("TEST: now write postings");
279 }
280
281 Codec codec = Codec.getDefault();
282 final SegmentInfo si = new SegmentInfo(dir, Version.LATEST, SEGMENT, 10000, false, codec, Collections.<String,String>emptyMap(), StringHelper.randomId(), new HashMap<String,String>());
283 this.write(si, fieldInfos, dir, fields);
284
285 if (VERBOSE) {
286 System.out.println("TEST: now read postings");
287 }
288 final FieldsProducer terms = codec.postingsFormat().fieldsProducer(new SegmentReadState(dir, si, fieldInfos, newIOContext(random())));
289
290 final Verify[] threads = new Verify[NUM_TEST_THREADS-1];
291 for(int i=0;i<NUM_TEST_THREADS-1;i++) {
292 threads[i] = new Verify(si, fields, terms);
293 threads[i].setDaemon(true);
294 threads[i].start();
295 }
296
297 new Verify(si, fields, terms).run();
298
299 for(int i=0;i<NUM_TEST_THREADS-1;i++) {
300 threads[i].join();
301 assert !threads[i].failed;
302 }
303
304 terms.close();
305 dir.close();
306 }
307
308 private class Verify extends Thread {
309 final Fields termsDict;
310 final FieldData[] fields;
311 final SegmentInfo si;
312 volatile boolean failed;
313
314 Verify(final SegmentInfo si, final FieldData[] fields, final Fields termsDict) {
315 this.fields = fields;
316 this.termsDict = termsDict;
317 this.si = si;
318 }
319
320 @Override
321 public void run() {
322 try {
323 this._run();
324 } catch (final Throwable t) {
325 failed = true;
326 throw new RuntimeException(t);
327 }
328 }
329
330 private void verifyDocs(final int[] docs, final PositionData[][] positions, final PostingsEnum postingsEnum, final boolean doPos) throws Throwable {
331 for(int i=0;i<docs.length;i++) {
332 final int doc = postingsEnum.nextDoc();
333 assertTrue(doc != DocIdSetIterator.NO_MORE_DOCS);
334 assertEquals(docs[i], doc);
335 if (doPos) {
336 this.verifyPositions(positions[i], postingsEnum);
337 }
338 }
339 assertEquals(DocIdSetIterator.NO_MORE_DOCS, postingsEnum.nextDoc());
340 }
341
342 byte[] data = new byte[10];
343
344 private void verifyPositions(final PositionData[] positions, final PostingsEnum posEnum) throws Throwable {
345 for(int i=0;i<positions.length;i++) {
346 final int pos = posEnum.nextPosition();
347 assertEquals(positions[i].pos, pos);
348 if (positions[i].payload != null) {
349 assertNotNull(posEnum.getPayload());
350 if (random().nextInt(3) < 2) {
351
352 final BytesRef otherPayload = posEnum.getPayload();
353 assertTrue("expected=" + positions[i].payload.toString() + " got=" + otherPayload.toString(), positions[i].payload.equals(otherPayload));
354 }
355 } else {
356 assertNull(posEnum.getPayload());
357 }
358 }
359 }
360
361 public void _run() throws Throwable {
362
363 for(int iter=0;iter<NUM_TEST_ITER;iter++) {
364 final FieldData field = fields[random().nextInt(fields.length)];
365 final TermsEnum termsEnum = termsDict.terms(field.fieldInfo.name).iterator();
366
367 int upto = 0;
368
369 while(true) {
370 final BytesRef term = termsEnum.next();
371 if (term == null) {
372 break;
373 }
374 final BytesRef expected = new BytesRef(field.terms[upto++].text2);
375 assertTrue("expected=" + expected + " vs actual " + term, expected.bytesEquals(term));
376 }
377 assertEquals(upto, field.terms.length);
378
379
380 TermData term = field.terms[random().nextInt(field.terms.length)];
381 TermsEnum.SeekStatus status = termsEnum.seekCeil(new BytesRef(term.text2));
382 assertEquals(status, TermsEnum.SeekStatus.FOUND);
383 assertEquals(term.docs.length, termsEnum.docFreq());
384 if (field.omitTF) {
385 this.verifyDocs(term.docs, term.positions, TestUtil.docs(random(), termsEnum, null, PostingsEnum.NONE), false);
386 } else {
387 this.verifyDocs(term.docs, term.positions, termsEnum.postings(null, PostingsEnum.ALL), true);
388 }
389
390
391 final int idx = random().nextInt(field.terms.length);
392 term = field.terms[idx];
393 boolean success = false;
394 try {
395 termsEnum.seekExact(idx);
396 success = true;
397 } catch (UnsupportedOperationException uoe) {
398
399 }
400 if (success) {
401 assertEquals(status, TermsEnum.SeekStatus.FOUND);
402 assertTrue(termsEnum.term().bytesEquals(new BytesRef(term.text2)));
403 assertEquals(term.docs.length, termsEnum.docFreq());
404 if (field.omitTF) {
405 this.verifyDocs(term.docs, term.positions, TestUtil.docs(random(), termsEnum, null, PostingsEnum.NONE), false);
406 } else {
407 this.verifyDocs(term.docs, term.positions, termsEnum.postings(null, PostingsEnum.ALL), true);
408 }
409 }
410
411
412 if (VERBOSE) {
413 System.out.println("TEST: seek non-exist terms");
414 }
415 for(int i=0;i<100;i++) {
416 final String text2 = TestUtil.randomUnicodeString(random()) + ".";
417 status = termsEnum.seekCeil(new BytesRef(text2));
418 assertTrue(status == TermsEnum.SeekStatus.NOT_FOUND ||
419 status == TermsEnum.SeekStatus.END);
420 }
421
422
423 if (VERBOSE) {
424 System.out.println("TEST: seek terms backwards");
425 }
426 for(int i=field.terms.length-1;i>=0;i--) {
427 assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef(field.terms[i].text2)));
428 assertEquals(field.terms[i].docs.length, termsEnum.docFreq());
429 }
430
431
432 for(int i=field.terms.length-1;i>=0;i--) {
433 try {
434 termsEnum.seekExact(i);
435 assertEquals(field.terms[i].docs.length, termsEnum.docFreq());
436 assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[i].text2)));
437 } catch (UnsupportedOperationException uoe) {
438 }
439 }
440
441
442 status = termsEnum.seekCeil(new BytesRef(""));
443 assertNotNull(status);
444
445
446
447 assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[0].text2)));
448
449
450 termsEnum.seekCeil(new BytesRef(""));
451 upto = 0;
452 do {
453 term = field.terms[upto];
454 if (random().nextInt(3) == 1) {
455 final PostingsEnum postings;
456 if (!field.omitTF) {
457
458
459 postings = termsEnum.postings(null, PostingsEnum.ALL);
460 } else {
461 postings = TestUtil.docs(random(), termsEnum, null, PostingsEnum.FREQS);
462 }
463 assertNotNull(postings);
464 int upto2 = -1;
465 boolean ended = false;
466 while(upto2 < term.docs.length-1) {
467
468 final int left = term.docs.length-upto2;
469 int doc;
470 if (random().nextInt(3) == 1 && left >= 1) {
471 final int inc = 1+random().nextInt(left-1);
472 upto2 += inc;
473 if (random().nextInt(2) == 1) {
474 doc = postings.advance(term.docs[upto2]);
475 assertEquals(term.docs[upto2], doc);
476 } else {
477 doc = postings.advance(1+term.docs[upto2]);
478 if (doc == DocIdSetIterator.NO_MORE_DOCS) {
479
480 assert upto2 == term.docs.length-1;
481 ended = true;
482 break;
483 } else {
484
485 assert upto2 < term.docs.length-1;
486 if (doc >= term.docs[1+upto2]) {
487 upto2++;
488 }
489 }
490 }
491 } else {
492 doc = postings.nextDoc();
493 assertTrue(doc != -1);
494 upto2++;
495 }
496 assertEquals(term.docs[upto2], doc);
497 if (!field.omitTF) {
498 assertEquals(term.positions[upto2].length, postings.freq());
499 if (random().nextInt(2) == 1) {
500 this.verifyPositions(term.positions[upto2], postings);
501 }
502 }
503 }
504
505 if (!ended) {
506 assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());
507 }
508 }
509 upto++;
510
511 } while (termsEnum.next() != null);
512
513 assertEquals(upto, field.terms.length);
514 }
515 }
516 }
517
518 private static class DataFields extends Fields {
519 private final FieldData[] fields;
520
521 public DataFields(FieldData[] fields) {
522
523 this.fields = fields;
524 }
525
526 @Override
527 public Iterator<String> iterator() {
528 return new Iterator<String>() {
529 int upto = -1;
530
531 @Override
532 public boolean hasNext() {
533 return upto+1 < fields.length;
534 }
535
536 @Override
537 public String next() {
538 upto++;
539 return fields[upto].fieldInfo.name;
540 }
541
542 @Override
543 public void remove() {
544 throw new UnsupportedOperationException();
545 }
546 };
547 }
548
549 @Override
550 public Terms terms(String field) {
551
552 for(FieldData fieldData : fields) {
553 if (fieldData.fieldInfo.name.equals(field)) {
554 return new DataTerms(fieldData);
555 }
556 }
557 return null;
558 }
559
560 @Override
561 public int size() {
562 return fields.length;
563 }
564 }
565
566 private static class DataTerms extends Terms {
567 final FieldData fieldData;
568
569 public DataTerms(FieldData fieldData) {
570 this.fieldData = fieldData;
571 }
572
573 @Override
574 public TermsEnum iterator() {
575 return new DataTermsEnum(fieldData);
576 }
577
578 @Override
579 public long size() {
580 throw new UnsupportedOperationException();
581 }
582
583 @Override
584 public long getSumTotalTermFreq() {
585 throw new UnsupportedOperationException();
586 }
587
588 @Override
589 public long getSumDocFreq() {
590 throw new UnsupportedOperationException();
591 }
592
593 @Override
594 public int getDocCount() {
595 throw new UnsupportedOperationException();
596 }
597
598 @Override
599 public boolean hasFreqs() {
600 return fieldData.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
601 }
602
603 @Override
604 public boolean hasOffsets() {
605 return fieldData.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
606 }
607
608 @Override
609 public boolean hasPositions() {
610 return fieldData.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
611 }
612
613 @Override
614 public boolean hasPayloads() {
615 return fieldData.fieldInfo.hasPayloads();
616 }
617 }
618
619 private static class DataTermsEnum extends TermsEnum {
620 final FieldData fieldData;
621 private int upto = -1;
622
623 public DataTermsEnum(FieldData fieldData) {
624 this.fieldData = fieldData;
625 }
626
627 @Override
628 public BytesRef next() {
629 upto++;
630 if (upto == fieldData.terms.length) {
631 return null;
632 }
633
634 return term();
635 }
636
637 @Override
638 public BytesRef term() {
639 return fieldData.terms[upto].text;
640 }
641
642 @Override
643 public SeekStatus seekCeil(BytesRef text) {
644
645 for(int i=0;i<fieldData.terms.length;i++) {
646 int cmp = fieldData.terms[i].text.compareTo(text);
647 if (cmp == 0) {
648 upto = i;
649 return SeekStatus.FOUND;
650 } else if (cmp > 0) {
651 upto = i;
652 return SeekStatus.NOT_FOUND;
653 }
654 }
655
656 return SeekStatus.END;
657 }
658
659 @Override
660 public void seekExact(long ord) {
661 throw new UnsupportedOperationException();
662 }
663
664 @Override
665 public long ord() {
666 throw new UnsupportedOperationException();
667 }
668
669 @Override
670 public int docFreq() {
671 throw new UnsupportedOperationException();
672 }
673
674 @Override
675 public long totalTermFreq() {
676 throw new UnsupportedOperationException();
677 }
678
679 @Override
680 public PostingsEnum postings(PostingsEnum reuse, int flags) {
681 return new DataPostingsEnum(fieldData.terms[upto]);
682 }
683
684 }
685
686 private static class DataPostingsEnum extends PostingsEnum {
687 final TermData termData;
688 int docUpto = -1;
689 int posUpto;
690
691 public DataPostingsEnum(TermData termData) {
692 this.termData = termData;
693 }
694
695 @Override
696 public long cost() {
697 throw new UnsupportedOperationException();
698 }
699
700 @Override
701 public int nextDoc() {
702 docUpto++;
703 if (docUpto == termData.docs.length) {
704 return NO_MORE_DOCS;
705 }
706 posUpto = -1;
707 return docID();
708 }
709
710 @Override
711 public int docID() {
712 return termData.docs[docUpto];
713 }
714
715 @Override
716 public int advance(int target) {
717
718 nextDoc();
719 while (docID() < target) {
720 nextDoc();
721 }
722
723 return docID();
724 }
725
726 @Override
727 public int freq() {
728 return termData.positions[docUpto].length;
729 }
730
731 @Override
732 public int nextPosition() {
733 posUpto++;
734 return termData.positions[docUpto][posUpto].pos;
735 }
736
737 @Override
738 public BytesRef getPayload() {
739 return termData.positions[docUpto][posUpto].payload;
740 }
741
742 @Override
743 public int startOffset() {
744 throw new UnsupportedOperationException();
745 }
746
747 @Override
748 public int endOffset() {
749 throw new UnsupportedOperationException();
750 }
751 }
752
753 private void write(SegmentInfo si, final FieldInfos fieldInfos, final Directory dir, final FieldData[] fields) throws Throwable {
754
755 final Codec codec = si.getCodec();
756 final SegmentWriteState state = new SegmentWriteState(InfoStream.getDefault(), dir, si, fieldInfos, null, newIOContext(random()));
757
758 Arrays.sort(fields);
759 FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(state);
760 boolean success = false;
761 try {
762 consumer.write(new DataFields(fields));
763 success = true;
764 } finally {
765 if (success) {
766 IOUtils.close(consumer);
767 } else {
768 IOUtils.closeWhileHandlingException(consumer);
769 }
770 }
771 }
772
773 public void testDocsOnlyFreq() throws Exception {
774
775
776 Directory dir = newDirectory();
777 Random random = random();
778 IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random)));
779
780 int numDocs = atLeast(random, 50);
781 for (int i = 0; i < numDocs; i++) {
782 Document doc = new Document();
783 doc.add(new StringField("f", "doc", Store.NO));
784 writer.addDocument(doc);
785 }
786 writer.close();
787
788 Term term = new Term("f", new BytesRef("doc"));
789 DirectoryReader reader = DirectoryReader.open(dir);
790 for (LeafReaderContext ctx : reader.leaves()) {
791 PostingsEnum de = ctx.reader().postings(term);
792 while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
793 assertEquals("wrong freq for doc " + de.docID(), 1, de.freq());
794 }
795 }
796 reader.close();
797
798 dir.close();
799 }
800
801 }